{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "surrounded-corner",
   "metadata": {},
   "source": [
    "## 爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "empirical-perception",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lxml.html import fromstring\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "willing-blanket",
   "metadata": {},
   "outputs": [],
   "source": [
    "import selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "muslim-jason",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "import time\n",
    "\n",
    "# Smoke test: drive a throwaway Chrome session through a Baidu search to\n",
    "# confirm that Selenium and chromedriver are set up correctly.\n",
    "wd = webdriver.Chrome()\n",
    "try:\n",
    "    wd.get(\"https://www.baidu.com\")  # open the Baidu home page\n",
    "    # find_element(By.ID, ...) replaces find_element_by_id, which was\n",
    "    # removed in Selenium 4.3.\n",
    "    wd.find_element(By.ID, \"kw\").send_keys(\"selenium\")  # type the keyword into the search box\n",
    "    wd.find_element(By.ID, \"su\").click()  # press the search button\n",
    "    time.sleep(3)  # keep the results on screen for three seconds\n",
    "finally:\n",
    "    wd.quit()  # always close the browser, even if a step above failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "historical-polyester",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "noticed-spiritual",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-104-5f738edac08d>:15: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox') #解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "brave-turkish",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the CNKI home page in the Chrome session.\n",
    "CNKI_HOME_URL = \"https://www.cnki.net\"\n",
    "driver.get(CNKI_HOME_URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "automated-tobago",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"advanced search\" link on the CNKI home page.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was\n",
    "# removed in Selenium 4.3.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"highSearch\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "inappropriate-stranger",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-147163A690285E5E38ED44B4585477B4',\n",
       " 'CDwindow-1EB28E536114C04C8AB041AE8ED2E402']"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the open windows. The driver gives every window its own unique handle,\n",
    "# so always check the handles first whenever more than one window is open.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "ruled-staff",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-108-704fdf3805c1>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "robust-beast",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the \"journals\" tab.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/ul[1]/li[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "underlying-street",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the journal-source checkboxes, located by their \"key\" attribute.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "for source_key in (\"CSI\", \"HX\"):\n",
    "    driver.find_element(By.XPATH, f'//input[@key=\"{source_key}\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "minimal-fiber",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the remaining journal-source checkboxes, in the same order as before.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "for source_key in (\"SI\", \"EI\", \"CSD\"):\n",
    "    driver.find_element(By.XPATH, f'//input[@key=\"{source_key}\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "convertible-program",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Focus the keyword box of the advanced-search form. Advanced search is enough\n",
    "# for a loose match; the professional search is recommended for precise queries.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '//*[@id=\"gradetxt\"]/dd[1]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "greater-intranet",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the search keyword into the advanced-search keyword box.\n",
    "payload = {\"keyword\": \"网络与新媒体\"}\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "keyword_box = driver.find_element(By.XPATH, '//*[@id=\"gradetxt\"]/dd[1]/div[2]/input')\n",
    "keyword_box.send_keys(payload['keyword'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mental-douglas",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit the advanced search.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '/html/body/div[4]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "sonic-eagle",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to the professional-search tab.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/ul/li[4]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "killing-network",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'SU=主题,TKA=篇关摘,TI=篇名,KY=关键词,AB=摘要,CO=小标题,FT=全文,AU=作者,FI=第一作者,RP=通讯作者,AF=作者单位,LY=期刊名称,RF=参考文献,FU=基金,             CLC=中图分类号,     SN=ISSN,CN=CN,   DOI=DOI,QKLM=栏目信息,FAF=第一单位,CF=被引频次'"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the field-code legend for professional-search expressions (SU, TI, KY, ...).\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[2]/dl/dd[1]/p').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "occupational-recognition",
   "metadata": {},
   "outputs": [],
   "source": [
    "AI_新媒体_query= '(SU=\"新媒体\" and SU=\"人工智能\") OR (SU=\"AI\" and KY=\"新媒体\") OR (TI=\"机器学习\" and KY=\"新媒体\")'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "featured-malawi",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace whatever is in the query box with the professional-search expression.\n",
    "# The textarea is looked up once and reused; find_element(By.XPATH, ...)\n",
    "# replaces the removed find_element_by_xpath.\n",
    "query_box = driver.find_element(By.XPATH, '//textarea')\n",
    "query_box.clear()\n",
    "query_box.send_keys(AI_新媒体_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "confidential-consultancy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the professional search.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "satisfied-walter",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show 50 results per page.\n",
    "# find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.\n",
    "# 1. Open the page-size dropdown.\n",
    "driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/div/i').click()\n",
    "# 2. Pick the third entry (50).\n",
    "driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/ul/li[3]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "processed-sauce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>新文艺评论的事态、意态与情态</td>\n",
       "      <td>黄鸣奋</td>\n",
       "      <td>中国文艺评论</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体时代城市旅游形象的传播与推广研究</td>\n",
       "      <td>王艳</td>\n",
       "      <td>黑河学刊</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>163.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>融媒体新时代媒体宣传管理中的涌现秩序问题</td>\n",
       "      <td>王春宏</td>\n",
       "      <td>老字号品牌营销</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例</td>\n",
       "      <td>巩晗</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>77.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于人工智能技术的新媒体交互艺术表达设计</td>\n",
       "      <td>许洋洋</td>\n",
       "      <td>自动化技术与应用</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>新媒体技术条件下辽宁传统媒体定位创新研究</td>\n",
       "      <td>王旭</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》</td>\n",
       "      <td>杜菁</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>181.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>播音主持应对人工智能的策略与思考</td>\n",
       "      <td>胡未央</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>一纸风行到一端在手——《华西都市报》的转型探析</td>\n",
       "      <td>刘洁</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>37.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>人工智能与新媒体传播双重视域下高校美育实践的改革创新</td>\n",
       "      <td>张建; 高尚</td>\n",
       "      <td>绵阳师范学院学报</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>新媒体艺术：艺术借助科技拓展无边疆域</td>\n",
       "      <td>NaN</td>\n",
       "      <td>上海艺术评论</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>117.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>科技类图书传统纸媒与新媒体结合的探索</td>\n",
       "      <td>刘利平</td>\n",
       "      <td>西部广播电视</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>海南党报融媒体的创意转型之路</td>\n",
       "      <td>臧晓丹</td>\n",
       "      <td>新媒体研究</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>人工智能主播的应用策略</td>\n",
       "      <td>王梦颖; 李怀苍</td>\n",
       "      <td>宁夏师范学院学报</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>61.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用</td>\n",
       "      <td>陈戈</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>44.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>智媒时代新媒体现状及提升路径探索</td>\n",
       "      <td>李红红</td>\n",
       "      <td>新闻传播</td>\n",
       "      <td>2021-03-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>287.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>浙报融媒体:破圈重构的思路与做法</td>\n",
       "      <td>王琳; 郑莉</td>\n",
       "      <td>传媒评论</td>\n",
       "      <td>2021-02-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>新媒体传播研究十大前沿话题</td>\n",
       "      <td>谭天; 初令伟</td>\n",
       "      <td>媒体融合新观察</td>\n",
       "      <td>2021-02-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>177.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>闪开！数据“裸奔时代”</td>\n",
       "      <td>NaN</td>\n",
       "      <td>商学院</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践</td>\n",
       "      <td>祝智敏; 李晓雨; 吴振宇</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>基于人工智能的传媒企业发展探析</td>\n",
       "      <td>孙芳</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>130.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>新媒体艺术再思考——“林茨电子艺术节”41年主题流变</td>\n",
       "      <td>蔡新元</td>\n",
       "      <td>美术观察</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>AI based research on exploration and innovatio...</td>\n",
       "      <td>Chen Yanjie;Zheng Na;Saravanan Vijayalakshmi</td>\n",
       "      <td>Journal of Intelligent &amp; Fuzzy Systems</td>\n",
       "      <td>2021-02-02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>论5G技术下“互联网+”高校工会管理服务模式的创新</td>\n",
       "      <td>张继芳</td>\n",
       "      <td>信息记录材料</td>\n",
       "      <td>2021-02-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>从“融”起来到“强”起来</td>\n",
       "      <td>何平</td>\n",
       "      <td>新闻潮</td>\n",
       "      <td>2021-01-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>数智与融合:5G科教新基建未来发展趋势和路径选择</td>\n",
       "      <td>曹三省; 胡倩倩</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>58.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>《科技日报》人工智能报道研究</td>\n",
       "      <td>赖晨璐; 陶贤都</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>人工智能媒介下文学创作商业走向分析</td>\n",
       "      <td>李娜</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-01-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>83.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>2020年网络新媒体传播:重大现实主题与学科研究进展</td>\n",
       "      <td>孟威</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>467.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>总台技术局一行参观调研中视广信天津研发中心</td>\n",
       "      <td>NaN</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>全媒体战略下电视内容生产的创新研究</td>\n",
       "      <td>仝文瑶; 邓璐; 邢毓雯</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>395.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>科技与艺术的交响——从“非物质/再物质:计算机艺术简史展”看新媒体艺术的历史和未来</td>\n",
       "      <td>李超</td>\n",
       "      <td>艺术当代</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>87.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>广告智能化研究的知识图谱</td>\n",
       "      <td>段淳林; 崔钰婷</td>\n",
       "      <td>新闻与传播评论</td>\n",
       "      <td>2021-01-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>491.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>工会系统运用新一代信息技术服务疫情防控和复工复产的现状、问题与对策</td>\n",
       "      <td>谢永胜</td>\n",
       "      <td>工会信息</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>中国数字媒体研究现状、问题及趋势</td>\n",
       "      <td>黄楚新; 朱常华</td>\n",
       "      <td>新闻论坛</td>\n",
       "      <td>2020-12-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>386.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>浅谈人工智能在新闻生产领域的应用——以中央广播电视总台为例</td>\n",
       "      <td>罗娜</td>\n",
       "      <td>新闻战线</td>\n",
       "      <td>2020-12-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>疫情下国际性会议融媒体传播机制——以2020世界人工智能大会云端峰会为例</td>\n",
       "      <td>陈实</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>智媒时代新媒体创新与发展</td>\n",
       "      <td>李红红</td>\n",
       "      <td>理论学习与探索</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>89.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>中共中央提议制定完善对网络直播、自媒体等新媒体业态和算法推荐、深度伪造等新技术应用的规范管理办法</td>\n",
       "      <td>NaN</td>\n",
       "      <td>中国广播</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>293.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>论媒介融合背景下播音主持人才转型路径</td>\n",
       "      <td>包晓曼</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>智能语音识别技术在闽南语广播电视节目智慧监管中的应用研究</td>\n",
       "      <td>郑晔; 欧智坚; 杨艇</td>\n",
       "      <td>广播与电视技术</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>44.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>四位一体，融创合一：我国媒体融合正式迈入3.0时代</td>\n",
       "      <td>王彦琦; 张海</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>智媒时代深度报道的问题与出路</td>\n",
       "      <td>吴诗晨</td>\n",
       "      <td>新闻前哨</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>142.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>一种高质量音视频转码压缩技术在电视新媒体领域的应用</td>\n",
       "      <td>封连伟</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>53.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>人工智能在短视频领域中的应用趋势</td>\n",
       "      <td>朱琦</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>136.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>The Impact and Integration of AI Journalists o...</td>\n",
       "      <td>Feiman Liu</td>\n",
       "      <td>International Journal of Intelligent Informati...</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>融媒体环境下传统媒体与新媒体深度融合探讨</td>\n",
       "      <td>陈启芳</td>\n",
       "      <td>中国地市报人</td>\n",
       "      <td>2020-12-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>92.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>媒体融合发展:回首“十三五”展望“十四五”</td>\n",
       "      <td>黄楚新; 朱常华; 邵赛男</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>260.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>新媒体技术在新闻采编业务中的运用分析</td>\n",
       "      <td>王泽勇</td>\n",
       "      <td>西部广播电视</td>\n",
       "      <td>2020-12-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>论新媒体技术在新闻采编业务中的运用</td>\n",
       "      <td>李朝敏</td>\n",
       "      <td>西部广播电视</td>\n",
       "      <td>2020-12-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                                     新文艺评论的事态、意态与情态   \n",
       "1            2                                新媒体时代城市旅游形象的传播与推广研究   \n",
       "2            3                               融媒体新时代媒体宣传管理中的涌现秩序问题   \n",
       "3            4             “人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例   \n",
       "4            5                               基于人工智能技术的新媒体交互艺术表达设计   \n",
       "5            6                               新媒体技术条件下辽宁传统媒体定位创新研究   \n",
       "6            7              新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》   \n",
       "7            8                                   播音主持应对人工智能的策略与思考   \n",
       "8            9                            一纸风行到一端在手——《华西都市报》的转型探析   \n",
       "9           10                         人工智能与新媒体传播双重视域下高校美育实践的改革创新   \n",
       "10          11                                 新媒体艺术：艺术借助科技拓展无边疆域   \n",
       "11          12                                 科技类图书传统纸媒与新媒体结合的探索   \n",
       "12          13                                     海南党报融媒体的创意转型之路   \n",
       "13          14                                        人工智能主播的应用策略   \n",
       "14          15                 智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用   \n",
       "15          16                                   智媒时代新媒体现状及提升路径探索   \n",
       "16          17                                   浙报融媒体:破圈重构的思路与做法   \n",
       "17          18                                      新媒体传播研究十大前沿话题   \n",
       "18          19                                        闪开！数据“裸奔时代”   \n",
       "19          20                   新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践   \n",
       "20          21                                    基于人工智能的传媒企业发展探析   \n",
       "21          22                         新媒体艺术再思考——“林茨电子艺术节”41年主题流变   \n",
       "22          23  AI based research on exploration and innovatio...   \n",
       "23          24                          论5G技术下“互联网+”高校工会管理服务模式的创新   \n",
       "24          25                                       从“融”起来到“强”起来   \n",
       "25          26                           数智与融合:5G科教新基建未来发展趋势和路径选择   \n",
       "26          27                                     《科技日报》人工智能报道研究   \n",
       "27          28                                  人工智能媒介下文学创作商业走向分析   \n",
       "28          29                         2020年网络新媒体传播:重大现实主题与学科研究进展   \n",
       "29          30                              总台技术局一行参观调研中视广信天津研发中心   \n",
       "30          31                                  全媒体战略下电视内容生产的创新研究   \n",
       "31          32          科技与艺术的交响——从“非物质/再物质:计算机艺术简史展”看新媒体艺术的历史和未来   \n",
       "32          33                                       广告智能化研究的知识图谱   \n",
       "33          34                  工会系统运用新一代信息技术服务疫情防控和复工复产的现状、问题与对策   \n",
       "34          35                                   中国数字媒体研究现状、问题及趋势   \n",
       "35          36                      浅谈人工智能在新闻生产领域的应用——以中央广播电视总台为例   \n",
       "36          37               疫情下国际性会议融媒体传播机制——以2020世界人工智能大会云端峰会为例   \n",
       "37          38                                       智媒时代新媒体创新与发展   \n",
       "38          39   中共中央提议制定完善对网络直播、自媒体等新媒体业态和算法推荐、深度伪造等新技术应用的规范管理办法   \n",
       "39          40                                 论媒介融合背景下播音主持人才转型路径   \n",
       "40          41                       智能语音识别技术在闽南语广播电视节目智慧监管中的应用研究   \n",
       "41          42                          四位一体，融创合一：我国媒体融合正式迈入3.0时代   \n",
       "42          43                                     智媒时代深度报道的问题与出路   \n",
       "43          44                          一种高质量音视频转码压缩技术在电视新媒体领域的应用   \n",
       "44          45                                   人工智能在短视频领域中的应用趋势   \n",
       "45          46  The Impact and Integration of AI Journalists o...   \n",
       "46          47                               融媒体环境下传统媒体与新媒体深度融合探讨   \n",
       "47          48                              媒体融合发展:回首“十三五”展望“十四五”   \n",
       "48          49                                 新媒体技术在新闻采编业务中的运用分析   \n",
       "49          50                                  论新媒体技术在新闻采编业务中的运用   \n",
       "\n",
       "                                              作者  \\\n",
       "0                                            黄鸣奋   \n",
       "1                                             王艳   \n",
       "2                                            王春宏   \n",
       "3                                             巩晗   \n",
       "4                                            许洋洋   \n",
       "5                                             王旭   \n",
       "6                                             杜菁   \n",
       "7                                            胡未央   \n",
       "8                                             刘洁   \n",
       "9                                         张建; 高尚   \n",
       "10                                           NaN   \n",
       "11                                           刘利平   \n",
       "12                                           臧晓丹   \n",
       "13                                      王梦颖; 李怀苍   \n",
       "14                                            陈戈   \n",
       "15                                           李红红   \n",
       "16                                        王琳; 郑莉   \n",
       "17                                       谭天; 初令伟   \n",
       "18                                           NaN   \n",
       "19                                 祝智敏; 李晓雨; 吴振宇   \n",
       "20                                            孙芳   \n",
       "21                                           蔡新元   \n",
       "22  Chen Yanjie;Zheng Na;Saravanan Vijayalakshmi   \n",
       "23                                           张继芳   \n",
       "24                                            何平   \n",
       "25                                      曹三省; 胡倩倩   \n",
       "26                                      赖晨璐; 陶贤都   \n",
       "27                                            李娜   \n",
       "28                                            孟威   \n",
       "29                                           NaN   \n",
       "30                                  仝文瑶; 邓璐; 邢毓雯   \n",
       "31                                            李超   \n",
       "32                                      段淳林; 崔钰婷   \n",
       "33                                           谢永胜   \n",
       "34                                      黄楚新; 朱常华   \n",
       "35                                            罗娜   \n",
       "36                                            陈实   \n",
       "37                                           李红红   \n",
       "38                                           NaN   \n",
       "39                                           包晓曼   \n",
       "40                                   郑晔; 欧智坚; 杨艇   \n",
       "41                                       王彦琦; 张海   \n",
       "42                                           吴诗晨   \n",
       "43                                           封连伟   \n",
       "44                                            朱琦   \n",
       "45                                    Feiman Liu   \n",
       "46                                           陈启芳   \n",
       "47                                 黄楚新; 朱常华; 邵赛男   \n",
       "48                                           王泽勇   \n",
       "49                                           李朝敏   \n",
       "\n",
       "                                                   刊名        发表时间   被引     下载  \\\n",
       "0                                              中国文艺评论  2021-05-25  NaN   23.0   \n",
       "1                                                黑河学刊  2021-05-20  NaN  163.0   \n",
       "2                                             老字号品牌营销  2021-05-10  NaN   34.0   \n",
       "3                                                青年记者  2021-04-30  NaN   77.0   \n",
       "4                                            自动化技术与应用  2021-04-25  NaN  120.0   \n",
       "5                                              新闻研究导刊  2021-04-25  NaN    4.0   \n",
       "6                                               新闻爱好者  2021-04-20  NaN  181.0   \n",
       "7                                                中国报业  2021-04-15  NaN   39.0   \n",
       "8                                                出版广角  2021-04-15  NaN   37.0   \n",
       "9                                            绵阳师范学院学报  2021-04-15  NaN  102.0   \n",
       "10                                             上海艺术评论  2021-04-15  NaN  117.0   \n",
       "11                                             西部广播电视  2021-03-25  NaN    9.0   \n",
       "12                                              新媒体研究  2021-03-25  NaN    6.0   \n",
       "13                                           宁夏师范学院学报  2021-03-15  NaN   61.0   \n",
       "14                                             现代电视技术  2021-03-15  NaN   44.0   \n",
       "15                                               新闻传播  2021-03-08  NaN  287.0   \n",
       "16                                               传媒评论  2021-02-25  NaN   34.0   \n",
       "17                                            媒体融合新观察  2021-02-25  NaN  177.0   \n",
       "18                                                商学院  2021-02-15  NaN   11.0   \n",
       "19                                             中国传媒科技  2021-02-15  NaN   27.0   \n",
       "20                                               传媒论坛  2021-02-07  NaN  130.0   \n",
       "21                                               美术观察  2021-02-05  NaN   73.0   \n",
       "22             Journal of Intelligent & Fuzzy Systems  2021-02-02  NaN    NaN   \n",
       "23                                             信息记录材料  2021-02-01  NaN   34.0   \n",
       "24                                                新闻潮  2021-01-28  NaN   25.0   \n",
       "25                                               科技传播  2021-01-25  NaN   58.0   \n",
       "26                                               科技传播  2021-01-25  NaN   38.0   \n",
       "27                                               中国报业  2021-01-22  NaN   83.0   \n",
       "28                                               当代传播  2021-01-15  NaN  467.0   \n",
       "29                                             现代电视技术  2021-01-15  NaN    6.0   \n",
       "30                                               中国电视  2021-01-15  NaN  395.0   \n",
       "31                                               艺术当代  2021-01-15  NaN   87.0   \n",
       "32                                            新闻与传播评论  2021-01-05  NaN  491.0   \n",
       "33                                               工会信息  2021-01-01  NaN   30.0   \n",
       "34                                               新闻论坛  2020-12-25  NaN  386.0   \n",
       "35                                               新闻战线  2020-12-23  NaN   21.0   \n",
       "36                                               青年记者  2020-12-20  NaN   84.0   \n",
       "37                                            理论学习与探索  2020-12-20  NaN   89.0   \n",
       "38                                               中国广播  2020-12-20  NaN  293.0   \n",
       "39                                               文化产业  2020-12-20  NaN   54.0   \n",
       "40                                            广播与电视技术  2020-12-15  NaN   44.0   \n",
       "41                                               出版广角  2020-12-15  2.0  134.0   \n",
       "42                                               新闻前哨  2020-12-15  1.0  142.0   \n",
       "43                                             现代电视技术  2020-12-15  1.0   53.0   \n",
       "44                                             中国传媒科技  2020-12-15  NaN  136.0   \n",
       "45  International Journal of Intelligent Informati...  2020-12-15  NaN    NaN   \n",
       "46                                             中国地市报人  2020-12-10  NaN   92.0   \n",
       "47                                               青年记者  2020-12-10  1.0  260.0   \n",
       "48                                             西部广播电视  2020-12-05  NaN   33.0   \n",
       "49                                             西部广播电视  2020-12-05  NaN    9.0   \n",
       "\n",
       "     操作  \n",
       "0    下载  \n",
       "1    下载  \n",
       "2    下载  \n",
       "3    下载  \n",
       "4    下载  \n",
       "5    下载  \n",
       "6    下载  \n",
       "7    下载  \n",
       "8    下载  \n",
       "9    下载  \n",
       "10   下载  \n",
       "11   下载  \n",
       "12   下载  \n",
       "13   下载  \n",
       "14   下载  \n",
       "15   下载  \n",
       "16   下载  \n",
       "17   下载  \n",
       "18   下载  \n",
       "19   下载  \n",
       "20   下载  \n",
       "21   下载  \n",
       "22  NaN  \n",
       "23   下载  \n",
       "24   下载  \n",
       "25   下载  \n",
       "26   下载  \n",
       "27   下载  \n",
       "28   下载  \n",
       "29   下载  \n",
       "30   下载  \n",
       "31   下载  \n",
       "32   下载  \n",
       "33   下载  \n",
       "34   下载  \n",
       "35   下载  \n",
       "36   下载  \n",
       "37   下载  \n",
       "38   下载  \n",
       "39   下载  \n",
       "40   下载  \n",
       "41   下载  \n",
       "42   下载  \n",
       "43   下载  \n",
       "44   下载  \n",
       "45  NaN  \n",
       "46   下载  \n",
       "47   下载  \n",
       "48   下载  \n",
       "49   下载  "
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch the first results page: take the inner HTML of the element with id\n",
    "# 'gridTable' and let pandas parse it; [0] keeps the first table it finds.\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "首页主要数据 = pd.read_html(page_html)[0]\n",
    "# Last expression of the cell: display the parsed first-page table.\n",
    "首页主要数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "spiritual-plain",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Locate the element with id 'PageNext' and show its inner HTML\n",
    "# (the recorded output is the button label).\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "unique-bullet",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/12'"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the pager text from the span with class 'countPageMark';\n",
    "# the recorded output is '1/12'.\n",
    "element = driver.find_element_by_xpath('//span[@class=\"countPageMark\"]')\n",
    "page_str = element.get_attribute('innerHTML')\n",
    "page_str "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "color-making",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1', '12']"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the pager text on '/' into a list of strings (recorded output: ['1', '12']).\n",
    "# Despite its name, page_int holds strings, not integers.\n",
    "page_int = page_str.split('/')\n",
    "page_int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "velvet-swiss",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n"
     ]
    }
   ],
   "source": [
    "# Build the 1-based list of page numbers up to the total taken from page_int[1].\n",
    "pages = list(range(1,int(page_int[1])+1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "mineral-despite",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page numbers to crawl. Derived from the total in page_int[1] rather than a\n",
    "# hard-coded 12-page bound, so a query with a different page count still works.\n",
    "pages = list(range(1, int(page_int[1]) + 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "usual-functionality",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with an empty page-key -> table-HTML mapping and blank placeholders.\n",
    "表格_html = {}\n",
    "main_content, element = \"\", None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "dental-synthesis",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pagination helper: click through the result pages and keep each page's table HTML.\n",
    "def process_pages(pages, min_wait=30, jitter=20):\n",
    "    \"\"\"Click the next-page button once per entry in ``pages`` and store the table HTML.\n",
    "\n",
    "    Uses the notebook-level ``driver`` and writes into the notebook-level\n",
    "    ``表格_html`` dict. The click happens before the table is read, so the HTML\n",
    "    saved under key ``p`` is whatever the browser shows after that click.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of keys; one next-page click is issued per key.\n",
    "        min_wait: minimum number of seconds to pause after each click.\n",
    "        jitter: upper bound, in seconds, of the extra random pause added on top.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are stored in ``表格_html``.\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print(p, end='\\t')\n",
    "        # Find the next-page button and click it.\n",
    "        next_button = driver.find_element_by_id('PageNext')\n",
    "        next_button.click()\n",
    "        # Randomised pause so the crawler is less likely to be blocked or shown a captcha.\n",
    "        time.sleep(min_wait + jitter * random())\n",
    "        # Keep the inner HTML of the element that holds the page's main result table.\n",
    "        table = driver.find_element_by_id('gridTable')\n",
    "        表格_html[p] = table.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "guilty-asbestos",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "stupid-george",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per stored page key, with that page's HTML in a single named column.\n",
    "df = pd.DataFrame([表格_html]).T.rename(columns={0: \"html_snippets\"})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "stopped-rebel",
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "# Output locations; the {网站} placeholder is meant to be filled in with str.format.\n",
    "fn = {\n",
    "    \"output\": {\n",
    "        \"htm_snippets\": \"data_raw_src/知网_htm_snippets_{网站}.tsv\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "fatty-tampa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the per-page HTML snippets as a tab-separated file.\n",
    "from pathlib import Path\n",
    "\n",
    "filename = fn[\"output\"][\"htm_snippets\"]\n",
    "out_path = Path(filename.format(网站=网站))\n",
    "# DataFrame.to_csv does not create missing folders, so make sure the target folder exists.\n",
    "out_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "df.to_csv(out_path, sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "confused-certificate",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse each stored page snippet with pandas and keep the first table of each page.\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "dramatic-parts",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>新文艺评论的事态、意态与情态</td>\n",
       "      <td>黄鸣奋</td>\n",
       "      <td>中国文艺评论</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体时代城市旅游形象的传播与推广研究</td>\n",
       "      <td>王艳</td>\n",
       "      <td>黑河学刊</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>163.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>融媒体新时代媒体宣传管理中的涌现秩序问题</td>\n",
       "      <td>王春宏</td>\n",
       "      <td>老字号品牌营销</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例</td>\n",
       "      <td>巩晗</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>77.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于人工智能技术的新媒体交互艺术表达设计</td>\n",
       "      <td>许洋洋</td>\n",
       "      <td>自动化技术与应用</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>595</th>\n",
       "      <td>496</td>\n",
       "      <td>新兴技术在高等教育领域中的应用趋势研究——基于《新媒体联盟地平线报告》的解读与启示</td>\n",
       "      <td>孙掌印</td>\n",
       "      <td>高等教育研究学报</td>\n",
       "      <td>2017-12-15</td>\n",
       "      <td>6.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>596</th>\n",
       "      <td>497</td>\n",
       "      <td>信息消费时代公民理性精神的培养</td>\n",
       "      <td>顾理平</td>\n",
       "      <td>新闻战线</td>\n",
       "      <td>2017-12-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>319.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>597</th>\n",
       "      <td>498</td>\n",
       "      <td>大数据和AI技术在新媒体传播渠道中的应用分析</td>\n",
       "      <td>赵子蒙</td>\n",
       "      <td>新媒体研究</td>\n",
       "      <td>2017-12-05 13:45</td>\n",
       "      <td>8.0</td>\n",
       "      <td>830.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>598</th>\n",
       "      <td>499</td>\n",
       "      <td>新媒体环境下SNS社交网络的品牌化运营分析</td>\n",
       "      <td>麦婕</td>\n",
       "      <td>科技经济导刊</td>\n",
       "      <td>2017-11-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>193.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599</th>\n",
       "      <td>500</td>\n",
       "      <td>新媒体时代传统媒体的转型之道</td>\n",
       "      <td>郝佳丽</td>\n",
       "      <td>新闻传播</td>\n",
       "      <td>2017-11-23</td>\n",
       "      <td>4.0</td>\n",
       "      <td>75.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>650 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                         篇名   作者        刊名  \\\n",
       "0             1                             新文艺评论的事态、意态与情态  黄鸣奋    中国文艺评论   \n",
       "1             2                        新媒体时代城市旅游形象的传播与推广研究   王艳      黑河学刊   \n",
       "2             3                       融媒体新时代媒体宣传管理中的涌现秩序问题  王春宏   老字号品牌营销   \n",
       "3             4     “人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例   巩晗      青年记者   \n",
       "4             5                       基于人工智能技术的新媒体交互艺术表达设计  许洋洋  自动化技术与应用   \n",
       "..          ...                                        ...  ...       ...   \n",
       "595         496  新兴技术在高等教育领域中的应用趋势研究——基于《新媒体联盟地平线报告》的解读与启示  孙掌印  高等教育研究学报   \n",
       "596         497                            信息消费时代公民理性精神的培养  顾理平      新闻战线   \n",
       "597         498                     大数据和AI技术在新媒体传播渠道中的应用分析  赵子蒙     新媒体研究   \n",
       "598         499                      新媒体环境下SNS社交网络的品牌化运营分析   麦婕    科技经济导刊   \n",
       "599         500                             新媒体时代传统媒体的转型之道  郝佳丽      新闻传播   \n",
       "\n",
       "                 发表时间   被引     下载  操作  \n",
       "0          2021-05-25  NaN   23.0  下载  \n",
       "1          2021-05-20  NaN  163.0  下载  \n",
       "2          2021-05-10  NaN   34.0  下载  \n",
       "3          2021-04-30  NaN   77.0  下载  \n",
       "4          2021-04-25  NaN  120.0  下载  \n",
       "..                ...  ...    ...  ..  \n",
       "595        2017-12-15  6.0  350.0  下载  \n",
       "596        2017-12-08  NaN  319.0  下载  \n",
       "597  2017-12-05 13:45  8.0  830.0  下载  \n",
       "598        2017-11-25  1.0  193.0  下载  \n",
       "599        2017-11-23  4.0   75.0  下载  \n",
       "\n",
       "[650 rows x 8 columns]"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the tables of all crawled pages into one frame with a fresh 0..n-1 index.\n",
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "# Put the first-page table in front. pd.concat replaces DataFrame.append, which was\n",
    "# deprecated in pandas 1.4 and removed in 2.0; like append, it keeps each frame's own\n",
    "# index labels, so the first-page labels repeat in the result.\n",
    "df_总表 = pd.concat([首页主要数据, df_url_out])\n",
    "df_总表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "neither-arthritis",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>新文艺评论的事态、意态与情态</td>\n",
       "      <td>黄鸣奋</td>\n",
       "      <td>中国文艺评论</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体时代城市旅游形象的传播与推广研究</td>\n",
       "      <td>王艳</td>\n",
       "      <td>黑河学刊</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>163.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>融媒体新时代媒体宣传管理中的涌现秩序问题</td>\n",
       "      <td>王春宏</td>\n",
       "      <td>老字号品牌营销</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例</td>\n",
       "      <td>巩晗</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>77.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于人工智能技术的新媒体交互艺术表达设计</td>\n",
       "      <td>许洋洋</td>\n",
       "      <td>自动化技术与应用</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>595</th>\n",
       "      <td>496</td>\n",
       "      <td>新兴技术在高等教育领域中的应用趋势研究——基于《新媒体联盟地平线报告》的解读与启示</td>\n",
       "      <td>孙掌印</td>\n",
       "      <td>高等教育研究学报</td>\n",
       "      <td>2017-12-15</td>\n",
       "      <td>6.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>596</th>\n",
       "      <td>497</td>\n",
       "      <td>信息消费时代公民理性精神的培养</td>\n",
       "      <td>顾理平</td>\n",
       "      <td>新闻战线</td>\n",
       "      <td>2017-12-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>319.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>597</th>\n",
       "      <td>498</td>\n",
       "      <td>大数据和AI技术在新媒体传播渠道中的应用分析</td>\n",
       "      <td>赵子蒙</td>\n",
       "      <td>新媒体研究</td>\n",
       "      <td>2017-12-05 13:45</td>\n",
       "      <td>8.0</td>\n",
       "      <td>830.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>598</th>\n",
       "      <td>499</td>\n",
       "      <td>新媒体环境下SNS社交网络的品牌化运营分析</td>\n",
       "      <td>麦婕</td>\n",
       "      <td>科技经济导刊</td>\n",
       "      <td>2017-11-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>193.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599</th>\n",
       "      <td>500</td>\n",
       "      <td>新媒体时代传统媒体的转型之道</td>\n",
       "      <td>郝佳丽</td>\n",
       "      <td>新闻传播</td>\n",
       "      <td>2017-11-23</td>\n",
       "      <td>4.0</td>\n",
       "      <td>75.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>650 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                         篇名   作者        刊名  \\\n",
       "0             1                             新文艺评论的事态、意态与情态  黄鸣奋    中国文艺评论   \n",
       "1             2                        新媒体时代城市旅游形象的传播与推广研究   王艳      黑河学刊   \n",
       "2             3                       融媒体新时代媒体宣传管理中的涌现秩序问题  王春宏   老字号品牌营销   \n",
       "3             4     “人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例   巩晗      青年记者   \n",
       "4             5                       基于人工智能技术的新媒体交互艺术表达设计  许洋洋  自动化技术与应用   \n",
       "..          ...                                        ...  ...       ...   \n",
       "595         496  新兴技术在高等教育领域中的应用趋势研究——基于《新媒体联盟地平线报告》的解读与启示  孙掌印  高等教育研究学报   \n",
       "596         497                            信息消费时代公民理性精神的培养  顾理平      新闻战线   \n",
       "597         498                     大数据和AI技术在新媒体传播渠道中的应用分析  赵子蒙     新媒体研究   \n",
       "598         499                      新媒体环境下SNS社交网络的品牌化运营分析   麦婕    科技经济导刊   \n",
       "599         500                             新媒体时代传统媒体的转型之道  郝佳丽      新闻传播   \n",
       "\n",
       "                 发表时间   被引     下载  操作  \n",
       "0          2021-05-25  NaN   23.0  下载  \n",
       "1          2021-05-20  NaN  163.0  下载  \n",
       "2          2021-05-10  NaN   34.0  下载  \n",
       "3          2021-04-30  NaN   77.0  下载  \n",
       "4          2021-04-25  NaN  120.0  下载  \n",
       "..                ...  ...    ...  ..  \n",
       "595        2017-12-15  6.0  350.0  下载  \n",
       "596        2017-12-08  NaN  319.0  下载  \n",
       "597  2017-12-05 13:45  8.0  830.0  下载  \n",
       "598        2017-11-25  1.0  193.0  下载  \n",
       "599        2017-11-23  4.0   75.0  下载  \n",
       "\n",
       "[650 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Write the combined table to the sheet named 知网 of an .xlsx workbook, then show it.\n",
    "df_总表.to_excel('知网文章数据.xlsx', sheet_name=\"知网\", engine=\"openpyxl\")\n",
    "display(df_总表)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "phantom-surprise",
   "metadata": {},
   "source": [
    "## 下载"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "running-pontiac",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (<ipython-input-7-1ce885c12a73>, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-7-1ce885c12a73>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m    import WebDriverWaitfrom selenium.webdriver.support\u001b[0m\n\u001b[0m                             ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "# Standalone CNKI (mobile site) scraper: search for 'python', collect article metadata,\n",
    "# then save the records as JSON and CSV.\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.by import By\n",
    "import time\n",
    "import json\n",
    "import csv\n",
    "\n",
    "# Configure the Chrome driver.\n",
    "options = webdriver.ChromeOptions()\n",
    "# Do not load images, to speed things up.\n",
    "options.add_experimental_option(\"prefs\", {\"profile.managed_default_content_settings.images\": 2})\n",
    "# Create the Chrome driver.\n",
    "browser = webdriver.Chrome(options=options)\n",
    "url = 'http://wap.cnki.net/touch/web/guide'\n",
    "\n",
    "# XPath template for the result items of one loaded batch; '{}' is the index of the batch div.\n",
    "# NOTE(review): the pasted code had an empty predicate here ('div[@]'), which is not valid XPath.\n",
    "# The class name below is presumed from the sibling 'c-company__body-*' classes -- confirm on the live page.\n",
    "RESULT_ITEM_XPATH = '//div[@id=\"searchlist_div\"]/div[{}]/div[@class=\"c-company__body-item\"]'\n",
    "\n",
    "# Module-level list collecting one dict per scraped article.\n",
    "data_list = []\n",
    "\n",
    "\n",
    "def start_spider(page):\n",
    "    \"\"\"Search for 'python' and append one record per result to data_list.\n",
    "\n",
    "    page: number of batches to scrape. The loop stops once count == page, so 0\n",
    "    never matches and the loop only ends when a wait or element lookup raises.\n",
    "    \"\"\"\n",
    "    # Open the start page.\n",
    "    browser.get(url)\n",
    "    # Explicitly wait until the search box is present.\n",
    "    WebDriverWait(browser, 1000).until(\n",
    "        EC.presence_of_all_elements_located((By.ID, 'keyword'))\n",
    "    )\n",
    "    # Click the search box and type the keyword.\n",
    "    browser.find_element_by_id('keyword').click()\n",
    "    browser.find_element_by_id('keyword_ordinary').send_keys('python')\n",
    "    # Click the search button.\n",
    "    browser.find_element_by_class_name('btn-search ').click()\n",
    "    # Explicitly wait until the result list is present.\n",
    "    WebDriverWait(browser, 1000).until(\n",
    "        EC.presence_of_all_elements_located((By.CLASS_NAME, 'g-search-body'))\n",
    "    )\n",
    "\n",
    "    # Counter for the batch currently being processed.\n",
    "    count = 1\n",
    "    while True:\n",
    "        # Wait until the 'load more' button is present, then grab it.\n",
    "        WebDriverWait(browser, 1000).until(\n",
    "            EC.presence_of_all_elements_located((By.CLASS_NAME, 'c-company__body-item-more'))\n",
    "        )\n",
    "        Btn = browser.find_element_by_class_name('c-company__body-item-more')\n",
    "        # Per the original author's note, each load also inserts a count line, so the\n",
    "        # article batches live in the odd-numbered divs -- hence 2*count-1.\n",
    "        item_xpath = RESULT_ITEM_XPATH.format(2 * count - 1)\n",
    "        WebDriverWait(browser, 1000).until(\n",
    "            EC.presence_of_all_elements_located((By.XPATH, item_xpath))\n",
    "        )\n",
    "        divs = browser.find_elements_by_xpath(item_xpath)\n",
    "        for div in divs:\n",
    "            # Title, authors and abstract of the article.\n",
    "            name = div.find_element_by_class_name('c-company__body-title').text\n",
    "            author = div.find_element_by_class_name('c-company__body-author').text\n",
    "            content = div.find_element_by_class_name('c-company__body-content').text\n",
    "            # Source, date and (optionally) literature type, whitespace-separated.\n",
    "            text = div.find_element_by_class_name('c-company__body-name').text.split()\n",
    "            if (len(text) == 3 and text[-1] == '优先') or len(text) == 2:\n",
    "                source = text[0]\n",
    "                datetime = text[1]\n",
    "                literature_type = None\n",
    "            else:\n",
    "                source = text[0]\n",
    "                datetime = text[2]\n",
    "                literature_type = text[1]\n",
    "            # Download count and citation count; each token is split on the full-width colon.\n",
    "            temp = div.find_element_by_class_name('c-company__body-info').text.split()\n",
    "            download = temp[0].split('：')[-1]\n",
    "            cite = temp[1].split('：')[-1]\n",
    "            # One record per article.\n",
    "            data_dict = {\n",
    "                'name': name,\n",
    "                'author': author,\n",
    "                'content': content,\n",
    "                'source': source,\n",
    "                'datetime': datetime,\n",
    "                'literature_type': literature_type,\n",
    "                'download': download,\n",
    "                'cite': cite,\n",
    "            }\n",
    "            data_list.append(data_dict)\n",
    "            print(data_dict)\n",
    "        # NOTE(review): find_element_* raises NoSuchElementException rather than returning a\n",
    "        # falsy value, so this guard is presumably never taken -- confirm.\n",
    "        if not Btn:\n",
    "            break\n",
    "        Btn.click()\n",
    "        # Stop once the requested number of batches has been scraped.\n",
    "        if count == page:\n",
    "            break\n",
    "        count += 1\n",
    "        # Pause two seconds between batches to go easy on the server.\n",
    "        time.sleep(2)\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Prompt for the page count, run the spider, then write data_list to JSON and CSV.\"\"\"\n",
    "    # int() instead of eval(): the prompt asks for a number, and eval would execute arbitrary input.\n",
    "    start_spider(int(input('请输入要爬取的页数(如果需要全部爬取请输入0)：')))\n",
    "    # Write the JSON file with 'w' rather than 'a+': appending a second dump would leave invalid JSON.\n",
    "    with open('data_json.json', 'w', encoding='utf-8') as f:\n",
    "        json.dump(data_list, f, ensure_ascii=False, indent=4)\n",
    "    print('json文件写入完成')\n",
    "    # The CSV header comes from the first record, so there is nothing to write without data.\n",
    "    if not data_list:\n",
    "        print('没有爬取到数据，跳过csv文件写入')\n",
    "        return\n",
    "    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:\n",
    "        # Header row: keys of the first record.\n",
    "        title = data_list[0].keys()\n",
    "        writer = csv.DictWriter(f, title)\n",
    "        writer.writeheader()\n",
    "        writer.writerows(data_list)\n",
    "    print('csv文件写入完成')\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "handmade-block",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6]\n"
     ]
    }
   ],
   "source": [
    "# Export the RefWorks file (.txt) and download the articles.\n",
    "# At most 500 articles can be selected at once, so the work is split into two rounds.\n",
    "\n",
    "# Round 1: page numbers 1 through 6.\n",
    "pages = [page_no for page_no in range(1, 7)]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "married-attribute",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Go back to the first results page by clicking the element with id 'total'.\n",
    "total_element = driver.find_element_by_id('total')\n",
    "total_element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "artificial-zealand",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the current selection.\n",
    "clear_selection_link = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a')\n",
    "clear_selection_link.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "bored-deputy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the articles on the current page (50 per the original note), then turn the page.\n",
    "def process_choose(pages, select_wait=30, select_jitter=20, page_wait=5):\n",
    "    \"\"\"Tick 'select all' on the current results page and advance, once per entry in pages.\n",
    "\n",
    "    Uses the notebook-level `driver`; the entries of `pages` are only printed as\n",
    "    progress labels and do not steer navigation.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of page labels; one select-and-advance step runs per item.\n",
    "        select_wait: base pause in seconds after clicking 'selectCheckAll1'.\n",
    "        select_jitter: the pause is extended by select_jitter * random() seconds.\n",
    "        page_wait: pause in seconds after clicking 'PageNext'.\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print(p, end='\\t')\n",
    "        driver.find_element_by_id('selectCheckAll1').click()\n",
    "        # Randomised pause so the clicks are not evenly spaced.\n",
    "        time.sleep(select_wait + select_jitter * random())\n",
    "        driver.find_element_by_id('PageNext').click()\n",
    "        time.sleep(page_wait)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "injured-voluntary",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t"
     ]
    }
   ],
   "source": [
    "# Run one select-all + next-page step for every entry in `pages`.\n",
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "portable-fortune",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'Export & Analysis' icon.\n",
    "export_analysis_icon = driver.find_element_by_xpath('//i[@class=\"icon-d\"]')\n",
    "export_analysis_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "union-objective",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'Export literature' icon.\n",
    "export_literature_icon = driver.find_element_by_xpath('//i[@class=\"icon-r\"]')\n",
    "export_literature_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "published-treasurer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the link whose exporttype attribute is \"Refworks\".\n",
    "refworks_link = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]')\n",
    "refworks_link.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "ordinary-giving",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-147163A690285E5E38ED44B4585477B4',\n",
       " 'CDwindow-1EB28E536114C04C8AB041AE8ED2E402',\n",
       " 'CDwindow-52C5F98F2ED8EA55F8054029D17F5CEF']"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the handles (IDs) of all browser windows currently open in this session.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "cheap-repair",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-144-520070efe65b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 2 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "golden-blame",
   "metadata": {},
   "outputs": [
    {
     "ename": "NoSuchElementException",
     "evalue": "Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//i[@class=\"icon icon-export\"]\"}\n  (Session info: chrome=91.0.4472.106)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNoSuchElementException\u001b[0m                    Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-148-d0cdf9c29823>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# 导出 .txt文件\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'//i[@class=\"icon icon-export\"]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_xpath\u001b[0;34m(self, xpath)\u001b[0m\n\u001b[1;32m    392\u001b[0m             \u001b[0melement\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'//div/td[1]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    393\u001b[0m         \"\"\"\n\u001b[0;32m--> 394\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXPATH\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    396\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mfind_elements_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[0;34m(self, by, value)\u001b[0m\n\u001b[1;32m    974\u001b[0m                 \u001b[0mby\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    975\u001b[0m                 \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'[name=\"%s\"]'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 976\u001b[0;31m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[1;32m    977\u001b[0m             \u001b[0;34m'using'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    978\u001b[0m             'value': value})['value']\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self, driver_command, params)\u001b[0m\n\u001b[1;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    320\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[1;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[0;32m~/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'alert'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    241\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    244\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNoSuchElementException\u001b[0m: Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//i[@class=\"icon icon-export\"]\"}\n  (Session info: chrome=91.0.4472.106)\n"
     ]
    }
   ],
   "source": [
    "# Export the .txt file by clicking the export icon.\n",
    "# NOTE(review): the saved NoSuchElementException for this cell has execution count 148, i.e. it ran\n",
    "# after the switch to window_handles[1] (count 146); presumably it must run while the window\n",
    "# at index 2 is focused -- confirm.\n",
    "export_txt_icon = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]')\n",
    "export_txt_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "id": "affected-illness",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-146-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 1 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "corporate-stamp",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'bulk download' entry.\n",
    "bulk_download_item = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]')\n",
    "bulk_download_item.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "id": "daily-suicide",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-149-1f3bb34cc9cb>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 3 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "advanced-server",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the selected articles (200 per the original note).\n",
    "download_all_button = driver.find_element_by_id('btn-download-all')\n",
    "download_all_button.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "id": "cleared-drive",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-151-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus back to the window at index 1 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "id": "brave-insured",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the current selection.\n",
    "clear_selection_link = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a')\n",
    "clear_selection_link.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "balanced-listing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'PageNext' control to move to the next results page.\n",
    "next_page_control = driver.find_element_by_id('PageNext')\n",
    "next_page_control.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "smaller-reviewer",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[7, 8, 9, 10, 11, 12]\n"
     ]
    }
   ],
   "source": [
    "# Round 2 of the download: page numbers 7 through 12.\n",
    "pages = [page_no for page_no in range(7, 13)]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "shared-gibraltar",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7\t8\t9\t10\t11\t12\t"
     ]
    }
   ],
   "source": [
    "# Run one select-all + next-page step for every entry in `pages`.\n",
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "circular-malpractice",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'Export & Analysis' icon.\n",
    "export_analysis_icon = driver.find_element_by_xpath('//i[@class=\"icon-d\"]')\n",
    "export_analysis_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "id": "italic-script",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'Export literature' icon.\n",
    "export_literature_icon = driver.find_element_by_xpath('//i[@class=\"icon-r\"]')\n",
    "export_literature_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "brazilian-tissue",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the link whose exporttype attribute is \"Refworks\".\n",
    "refworks_link = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]')\n",
    "refworks_link.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "id": "dynamic-slope",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-147163A690285E5E38ED44B4585477B4',\n",
       " 'CDwindow-1EB28E536114C04C8AB041AE8ED2E402',\n",
       " 'CDwindow-60488D165355AA75BB570E8D84BCC423']"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the handles (IDs) of all browser windows currently open in this session.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "id": "bored-marketplace",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-160-520070efe65b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 2 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "id": "fallen-conviction",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the .txt file by clicking the export icon.\n",
    "export_txt_icon = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]')\n",
    "export_txt_icon.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "id": "proprietary-marsh",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-162-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 1 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "id": "enclosed-benjamin",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'bulk download' entry.\n",
    "bulk_download_item = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]')\n",
    "bulk_download_item.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "electoral-effects",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-164-1f3bb34cc9cb>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# Switch focus to the window at index 3 of driver.window_handles.\n",
    "# Uses driver.switch_to.window; the deprecated switch_to_window alias emits a DeprecationWarning.\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "id": "entire-analyst",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the selected articles (200 per the original note).\n",
    "download_all_button = driver.find_element_by_id('btn-download-all')\n",
    "download_all_button.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hybrid-sellers",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
